import urllib.request
import re

class GetData:
    """Helper bundling page download, regex extraction and file saving."""

    # Constructor
    def __init__(self):
        # A single (name, value) header pair. getHtmlData installs it on the
        # opener so every request carries a browser-like User-Agent.
        self.headers = (
            "User-Agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
        )

    # Fetch page content
    def getHtmlData(self, url):
        """Download ``url`` and return the raw response body.

        This is a best-effort call: any error raised while building the
        request, opening the URL or reading the body is printed and
        ``None`` is returned instead of propagating.

        Args:
            url: The URL to open.

        Returns:
            The response body as ``bytes``, or ``None`` when the download
            failed. Callers must check for ``None``.
        """
        try:
            opener = urllib.request.build_opener()
            opener.addheaders = [self.headers]
            # The context manager closes the response (and its underlying
            # connection) even if read() raises part-way through.
            with opener.open(url) as response:
                return response.read()
        except Exception as err:
            print(err)
            # Explicit so the "None on failure" contract is visible.
            return None

    # Extract the wanted content with a regular expression
    def getResult(self, pst, strData):
        """Return every match of the pattern ``pst`` found in ``strData``.

        The pattern is compiled with ``re.S`` so ``.`` also matches newlines.

        Args:
            pst: Regular expression pattern.
            strData: Text to search.

        Returns:
            The list produced by ``findall``: the captured groups when the
            pattern has groups, otherwise the whole matches.
        """
        return re.compile(pst, re.S).findall(strData)

    # Write a file
    def saveData(self, fileFullName, fileContent):
        """Write ``fileContent`` to ``fileFullName``, replacing any old file.

        Args:
            fileFullName: Path of the file to create or overwrite.
            fileContent: Bytes-like data; the file is opened in binary mode.
        """
        # ``with`` guarantees the handle is closed even if write() raises.
        with open(fileFullName, "wb") as file:
            file.write(fileContent)

def main(startUrl="http://blog.csdn.net", saveDir="/Users/ywj/Desktop/"):
    """Crawl the front page and save every linked article as an HTML file.

    Articles are written to ``saveDir`` as ``1.html``, ``2.html``, ... where
    the number is the 1-based position of the anchor on the front page.
    Anchors without a usable link, and articles whose download fails, are
    skipped instead of aborting the whole run.

    Args:
        startUrl: URL of the page whose article list is crawled.
        saveDir: Prefix prepended to each file name. It is concatenated
            as-is, so it should end with a path separator.
    """
    gd = GetData()

    # Crawl the front page
    pst1 = '<a strategy=(.*?)</a>'
    data = gd.getHtmlData(startUrl)
    if data is None:
        # getHtmlData prints the error and returns None on failure; without
        # the front page there is nothing to crawl.
        return
    # Article list of the front page, extracted with the regex
    firstPageData = gd.getResult(pst1, data.decode("UTF-8"))

    # Fetch every article in turn
    for i, anchor in enumerate(firstPageData, start=1):
        url2 = gd.getResult('href="(.*)" target="_blank">', anchor)  # article URL
        if not url2:
            # The anchor has no href in the expected form; nothing to fetch.
            continue
        content = gd.getHtmlData(url2[0])
        if content is None:
            # Download failed (already reported); do not leave an empty file.
            continue
        fileName = saveDir + str(i) + ".html"  # output file name
        gd.saveData(fileName, content)  # save the article locally

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
